fct_lump异常值处理,
fct_lump分箱使用方法reg:gamma in Pythonknitr::opts_chunk$set(warning = FALSE, message = FALSE, cache = T)
library(tidyverse)
library(knitr)
library(formattable)
library(skimr)
library(DT)
library(readxl)
library(xgboost)
library(SmartEDA)
library(DT)
library(tidyquant)
library(pryr)
# Build the full path to a file inside the project's "required_data" folder.
#
# @param x File name (character), e.g. "train.csv".
# @return Path of the form <working dir>/required_data/<x>.
get_path <- function(x) {
  file.path(getwd(), "required_data", x)
}
# Custom ggplot2 theme used throughout this analysis: theme_minimal()
# plus presentation-oriented tweaks (bold axis titles, rotated x tick
# labels, no legend title).
theme_ilo <- function(){
theme_minimal() +
theme(
# text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 9),
plot.caption = element_text(color = "gray30"),
# plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm"),
axis.title.x = element_text(size=12,face = "bold"),
axis.title.y = element_text(size=12,face = "bold"),
# Bold x/y axis titles read better on slides.
axis.text.x = element_text(size=7, angle = 70, hjust = 1),
# Axis tick labels: 7 pt works best here.
axis.text.y = element_text(size=7),
legend.title=element_blank()
)
}
# Rename columns whose names start with a digit ("1stFlrSF", "2ndFlrSF",
# "3SsnPorch") — such names violate R naming rules — by prefixing "v".
# Extracted as a helper so train and test are guaranteed to get the
# identical renaming (the original duplicated the rename pipeline).
rename_numeric_cols <- function(df) {
  df %>%
    rename(v1stFlrSF = `1stFlrSF`,
           v2ndFlrSF = `2ndFlrSF`,
           v3SsnPorch = `3SsnPorch`)
}
train <- read_csv(get_path("train.csv")) %>% rename_numeric_cols()
test <- read_csv(get_path("test.csv")) %>% rename_numeric_cols()
eda_data <- train %>% bind_rows(test)

变量进行了重命名1。
EDA 主要参考 SmartEDA 包 测评和 Tidyverse使用技巧。
ExpData(data=train,type=1) %>% datatable()ExpData(data=train,type=2) %>% datatable()ExpNumStat(train,by="A",gp="SalePrice",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2) %>%
mutate_at(vars(Per_of_Missing),percent) %>% datatable()Vname – Variable name - 变量名称Group – Target variable -TN – Total sample (inculded NA observations) - 样本总数nNeg – Total negative observations - 负样本数量nZero – Total zero observations - 零值数量nPos – Total positive observations - 正样本数量NegInf – Negative infinite count - 负无穷大极值PosInf – Positive infinite count - 正无穷大极值NA_value – Not Applicable count - 缺失值Per_of_Missing – Percentage of missings - 缺失率Min – minimum value - 最小值Max – maximum value - 最大值Mean – average value - 平均值Median – median value - 中位数SD – Standard deviation - 总体标准差CV – coefficient of variations (SD/mean)*100 - z scoreIQR – Inter quartile range - 四分位距 \(QD = \frac{Q3-Q1}{2}\)Qnt – Specified quantiles - 百分位点MesofShape – Skewness and Kurtosis - 偏度和峰度Outlier – Number of outliers - 异常值数量Cor – Correlation b/w target and independent variables - 自变量和因变量相关性ExpNumViz(train,gp=NULL,nlim=10,Page=c(2,2),sample=NULL)## $`0`
nlim: 少于10个样本不画图sample: 随机选择变量进行展示,NULL表示全部展示ExpNumViz(train,gp="SalePrice",nlim=4,fname=NULL,col=NULL,Page=c(2,2))## $`0`
ExpCTable(train,Target="SalePrice",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F) %>% datatable()ExpCatViz(train,gp=NULL,fname=NULL,clim=10,col=NULL,margin=2,Page = c(2,1))## $`0`
# Per-variable summary table: skimr summaries of train and test stacked
# together (tagged by data set), joined with manual variable annotations.
# NOTE(review): `comment` is not defined anywhere in this file — as written
# it would resolve to base::comment (a function) and fail in left_join().
# Presumably a data frame of variable descriptions (columns: var, ...)
# created in a chunk not shown here — confirm before running in isolation.
dashboard <-
bind_rows(
skim_to_wide(train) %>% mutate(tag = 'train'),
skim_to_wide(test) %>% mutate(tag = 'test')
) %>%
left_join(comment, by = c("variable"="var")) %>%
select(tag,hist, everything())
dashboard %>% datatable()skim函数的展示结果参考 Stack Overflowmagicyang这个表格比较大,因此建议将网页向右拖拽,即可查看相关的histogram图。
dashboard %>%
mutate(missing = as.integer(missing)) %>%
top_n(20,missing) %>%
distinct(variable)## # A tibble: 11 x 1
## variable
## <chr>
## 1 Alley
## 2 Fence
## 3 FireplaceQu
## 4 GarageCond
## 5 GarageFinish
## 6 GarageQual
## 7 GarageType
## 8 MiscFeature
## 9 PoolQC
## 10 GarageYrBlt
## 11 LotFrontage
setdiff(
train %>% names(),
test %>% names()
)## [1] "SalePrice"
train %>%
ggplot(aes(x = SalePrice)) +
geom_freqpoly() +
scale_x_continuous(labels = c('0','200k','400k','600k','800k')) +
theme_ilo() +
labs(
title = "pdf on SalePrice",
subtitle = "positive skewness",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)train_xgb <- get_watchlist(train)train_xgb_mod <- xgb.train(
data = train_xgb$dtrain,
# 1
eta = 0.1,
nround = 2000,
# 2
max_depth = 7,
min_child_weight = 17,
gamma = 0.72,
# 3
subsample = 0.8,
colsample_bytree = 0.95,
# 评价标准
# eval.metric = "error",
eval.metric = "rmse",
# eval.metric = ks_value,
# eval.metric = "auc",
# eval.metric = "logloss",
# objective
objective = "reg:linear", # 这是一个回归问题
# 其他
seed = 596,
watchlist = train_xgb$watchlist,
# 300万数据一起用!
nfold = 2,
early.stop = 50,
nthread = 8
)xgb.save(train_xgb_mod, file.path("required_data","train_xgb_mod.model"))train_xgb_mod <- xgb.load(file.path("required_data","train_xgb_mod.model"))dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))sm_ljx_180525 <-
tibble(
Id = test$Id,
SalePrice = predict(train_xgb_mod, dtest)
) %>%
write_csv(get_path("sm_ljx_180525.csv")) %>%
select(everything())bind_rows(
tibble(
y = train_xgb$train$SalePrice,
yhat = predict(train_xgb_mod, train_xgb$dtrain)
) %>%
mutate(tag = 'train'),
tibble(
y = train_xgb$test$SalePrice,
yhat = predict(train_xgb_mod, train_xgb$dtest)
) %>%
mutate(tag = 'test')
) %>%
mutate(yhat_bin = ntile(yhat,20)) %>%
group_by(tag,yhat_bin) %>%
summarise(y = mean(y), yhat = mean(yhat)) %>%
gather(key,value,y:yhat) %>%
ggplot(aes(x = yhat_bin,y = value, col = key)) +
geom_line() +
facet_wrap(~ tag) +
theme_ilo() +
labs(
x = "predicted binned value",
y = "actual value",
title = "lift curve on train and validation set",
subtitle = "binning by ntile function in ggplot2 package",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)theme_ilo函数参考 ggplot2使用技巧。xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(),
model = train_xgb_mod) %>%
xgb.plot.importance()相关口径解释见 xgboost的理解。
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
相关推导见 训练模型 training model 使用技巧。
boxcox_lambda <- MASS::boxcox(SalePrice ~ 1, data = train, lambda = seq(-0.25, 0.25, length = 10))boxcox_lambda_best <- boxcox_lambda %>% as.tibble() %>%
filter(y == max(y)) %>%
.$x
boxcox_lambda_best## [1] -0.07828283
# Compare the original SalePrice distribution against its Box-Cox
# transform (lambda chosen above via MASS::boxcox).
# FIX(review): for lambda != 0 the Box-Cox transform is
#   (y^lambda - 1) / lambda
# The original computed comp / (comp - 1) with comp = y^lambda, which is
# not the Box-Cox transform and is what produced the negative means seen
# in the printed output below.
SP_cb <-
train %>%
select(SalePrice) %>%
mutate(tag = 'original') %>%
bind_rows(
train %>%
# mutate(SalePrice = log(SalePrice),
#        tag = 'transformed') %>%
mutate(SalePrice = (SalePrice^boxcox_lambda_best - 1) / boxcox_lambda_best,
tag = 'transformed') %>%
select(SalePrice,tag)
)
SP_cb %>%
group_by(tag) %>%
summarise(
mean = mean(SalePrice),
sd = sd(SalePrice),
skew = skewness(SalePrice),
kurt = kurtosis(SalePrice)
) %>%
mutate_if(is.double, accounting)## # A tibble: 2 x 5
## tag mean sd skew kurt
## <chr> <S3: formattable> <S3: formattable> <S3: formattable> <S3: >
## 1 original 180,921.20 79,442.50 1.88 6.51
## 2 transformed (0.64) 0.03 (0.19) 1.10
SP_cb %>%
ggplot(aes(x = SalePrice, col = tag)) +
geom_freqpoly() +
facet_wrap(~ tag, scales = "free")我们发现四项指标均有改善,因此转换有效。
train_xgb_trans <- get_watchlist(
train %>%
# mutate(SalePrice = SalePrice^boxcox_lambda_best/(SalePrice^boxcox_lambda_best-1))
mutate(SalePrice = log(SalePrice))
)train_xgb_trans_mod <- xgb.train(
data = train_xgb_trans$dtrain,
# 1
eta = 0.1,
nround = 2000,
# 2
max_depth = 7,
min_child_weight = 17,
gamma = 0.72,
# 3
subsample = 0.8,
colsample_bytree = 0.95,
# 评价标准
# eval.metric = "error",
eval.metric = "rmse",
# eval.metric = ks_value,
# eval.metric = "auc",
# eval.metric = "logloss",
# objective
objective = "reg:linear", # 这是一个回归问题
# 其他
seed = 596,
watchlist = train_xgb_trans$watchlist,
# 300万数据一起用!
nfold = 2,
early.stop = 50,
nthread = 8
)xgb.save(train_xgb_trans_mod, file.path("required_data","train_xgb_trans_mod.model"))train_xgb_trans_mod <- xgb.load(file.path("required_data","train_xgb_trans_mod.model"))dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))sm_ljx_180527 <-
tibble(
Id = test$Id,
SalePrice = predict(train_xgb_trans_mod, dtest)
) %>%
# mutate(SalePrice = (SalePrice/(SalePrice-1))^(1/boxcox_lambda_best)) %>%
mutate(SalePrice = exp(SalePrice)) %>%
write_csv(get_path("sm_ljx_180527.csv")) %>%
select(everything())bind_rows(
tibble(
y = train_xgb_trans$train$SalePrice,
yhat = predict(train_xgb_trans_mod, train_xgb_trans$dtrain)
) %>%
mutate(tag = 'train'),
tibble(
y = train_xgb_trans$test$SalePrice,
yhat = predict(train_xgb_trans_mod, train_xgb_trans$dtest)
) %>%
mutate(tag = 'test')
) %>%
mutate(yhat_bin = ntile(yhat,20)) %>%
group_by(tag,yhat_bin) %>%
summarise(y = mean(y), yhat = mean(yhat)) %>%
gather(key,value,y:yhat) %>%
ggplot(aes(x = yhat_bin,y = value, col = key)) +
geom_line() +
facet_wrap(~ tag) +
theme_ilo() +
labs(
x = "predicted binned value",
y = "actual value",
title = "lift curve on train and validation set",
subtitle = "binning by ntile function in ggplot2 package",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)theme_ilo函数参考 ggplot2使用技巧。xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(),
model = train_xgb_trans_mod) %>%
xgb.plot.importance()相关口径解释见 xgboost的理解。
Your submission scored 0.15594, which is not an improvement of your best score. Keep trying!
结果没有提升,因此y变量在树模型上不需要做变换。
eda_data %>%
select(PoolQC,PoolArea) %>%
group_by(PoolQC,PoolArea) %>%
count() %>%
spread(PoolQC,n) %>%
filter(!is.na(`<NA>`))## # A tibble: 4 x 5
## # Groups: PoolArea [4]
## PoolArea Ex Fa Gd `<NA>`
## <int> <int> <int> <int> <int>
## 1 0 NA NA NA 2906
## 2 368 NA NA NA 1
## 3 444 NA NA NA 1
## 4 561 NA NA NA 1
PoolArea = 0表示没有游泳池,因此无法给出PoolQC,所以2906个缺失值作为一个level没有问题。 但是PoolQC有三个缺失值是拥有PoolArea值的,因此应该是漏记了。 (Owen 2017)
因此这里需要将这三个缺失值修改为Ex、Fa、Gd的其中一种。
eda_data %>%
group_by(PoolQC) %>%
summarise(PoolArea = mean(PoolArea) %>% accounting())## # A tibble: 4 x 2
## PoolQC PoolArea
## <chr> <S3: formattable>
## 1 Ex 359.75
## 2 Fa 583.50
## 3 Gd 648.50
## 4 <NA> 0.47
我们发现三种类别的PoolArea平均值如上,按照就近原则, 依次附上
Ex、Fa、Fa 或者Ex、Ex、FaEx、Fa、Faeda_data_ipt_PQC <-
eda_data %>%
# filter(PoolArea %in% c(368,444,561)) %>%
mutate(PoolQC =
case_when(
PoolArea == 368 ~ 'Ex',
PoolArea == 444 ~ 'Fa',
PoolArea == 561 ~ 'Fa',
TRUE ~ PoolQC
))eda_data_ipt_PQC_xgb <- get_watchlist(
eda_data_ipt_PQC %>%
semi_join(train,by = "Id")
)eda_data_ipt_PQC_xgb_mod <- xgb.train(
data = eda_data_ipt_PQC_xgb$dtrain,
# 1
eta = 0.1,
nround = 2000,
# 2
max_depth = 7,
min_child_weight = 17,
gamma = 0.72,
# 3
subsample = 0.8,
colsample_bytree = 0.95,
# 评价标准
# eval.metric = "error",
eval.metric = "rmse",
# eval.metric = ks_value,
# eval.metric = "auc",
# eval.metric = "logloss",
# objective
objective = "reg:linear", # 这是一个回归问题
# 其他
seed = 596,
watchlist = eda_data_ipt_PQC_xgb$watchlist,
# 300万数据一起用!
nfold = 2,
early.stop = 50,
nthread = 8
)xgb.save(eda_data_ipt_PQC_xgb_mod, file.path("required_data","eda_data_ipt_PQC_xgb_mod.model"))eda_data_ipt_PQC_xgb_mod <- xgb.load(file.path("required_data","eda_data_ipt_PQC_xgb_mod.model"))dtest <- xgb.DMatrix(data = data.matrix(
eda_data_ipt_PQC %>%
semi_join(test,by = "Id") %>%
select(-Id)))sm_ljx_180528 <-
tibble(
Id = test$Id,
SalePrice = predict(eda_data_ipt_PQC_xgb_mod, dtest)
) %>%
write_csv(get_path("sm_ljx_180528.csv"))
sm_ljx_180528bind_rows(
tibble(
y = eda_data_ipt_PQC_xgb$train$SalePrice,
yhat = predict(eda_data_ipt_PQC_xgb_mod, eda_data_ipt_PQC_xgb$dtrain)
) %>%
mutate(tag = 'train'),
tibble(
y = eda_data_ipt_PQC_xgb$test$SalePrice,
yhat = predict(eda_data_ipt_PQC_xgb_mod, eda_data_ipt_PQC_xgb$dtest)
) %>%
mutate(tag = 'test')
) %>%
mutate(yhat_bin = ntile(yhat,20)) %>%
group_by(tag,yhat_bin) %>%
summarise(y = mean(y), yhat = mean(yhat)) %>%
gather(key,value,y:yhat) %>%
ggplot(aes(x = yhat_bin,y = value, col = key)) +
geom_line() +
facet_wrap(~ tag) +
theme_ilo() +
labs(
x = "predicted binned value",
y = "actual value",
title = "lift curve on train and validation set",
subtitle = "binning by ntile function in ggplot2 package",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)theme_ilo函数参考 ggplot2使用技巧。xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(),
model = eda_data_ipt_PQC_xgb_mod) %>%
xgb.plot.importance()相关口径解释见 xgboost的理解。
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
结果没有提升,说明这种PoolQC缺失值填补方案没有带来改善。
Ex、Ex、Faeda_data_ipt_PQC2 <-
eda_data %>%
# filter(PoolArea %in% c(368,444,561)) %>%
mutate(PoolQC =
case_when(
PoolArea == 368 ~ 'Ex',
PoolArea == 444 ~ 'Ex',
PoolArea == 561 ~ 'Fa',
TRUE ~ PoolQC
))eda_data_ipt_PQC2_xgb <- get_watchlist(
eda_data_ipt_PQC2 %>%
semi_join(train,by = "Id")
)eda_data_ipt_PQC2_xgb_mod <- xgb.train(
data = eda_data_ipt_PQC2_xgb$dtrain,
# 1
eta = 0.1,
nround = 2000,
# 2
max_depth = 7,
min_child_weight = 17,
gamma = 0.72,
# 3
subsample = 0.8,
colsample_bytree = 0.95,
# 评价标准
# eval.metric = "error",
eval.metric = "rmse",
# eval.metric = ks_value,
# eval.metric = "auc",
# eval.metric = "logloss",
# objective
objective = "reg:linear", # 这是一个回归问题
# 其他
seed = 596,
watchlist = eda_data_ipt_PQC2_xgb$watchlist,
# 300万数据一起用!
nfold = 2,
early.stop = 50,
nthread = 8
)xgb.save(eda_data_ipt_PQC2_xgb_mod, file.path("required_data","eda_data_ipt_PQC2_xgb_mod.model"))eda_data_ipt_PQC2_xgb_mod <- xgb.load(file.path("required_data","eda_data_ipt_PQC2_xgb_mod.model"))dtest <- xgb.DMatrix(data = data.matrix(
eda_data_ipt_PQC2 %>%
semi_join(test,by = "Id") %>%
select(-Id)))sm_ljx_180528_02 <-
tibble(
Id = test$Id,
SalePrice = predict(eda_data_ipt_PQC2_xgb_mod, dtest)
) %>%
write_csv(get_path("sm_ljx_180528_02.csv"))
sm_ljx_180528_02bind_rows(
tibble(
y = eda_data_ipt_PQC2_xgb$train$SalePrice,
yhat = predict(eda_data_ipt_PQC2_xgb_mod, eda_data_ipt_PQC2_xgb$dtrain)
) %>%
mutate(tag = 'train'),
tibble(
y = eda_data_ipt_PQC2_xgb$test$SalePrice,
yhat = predict(eda_data_ipt_PQC2_xgb_mod, eda_data_ipt_PQC2_xgb$dtest)
) %>%
mutate(tag = 'test')
) %>%
mutate(yhat_bin = ntile(yhat,20)) %>%
group_by(tag,yhat_bin) %>%
summarise(y = mean(y), yhat = mean(yhat)) %>%
gather(key,value,y:yhat) %>%
ggplot(aes(x = yhat_bin,y = value, col = key)) +
geom_line() +
facet_wrap(~ tag) +
theme_ilo() +
labs(
x = "predicted binned value",
y = "actual value",
title = "lift curve on train and validation set",
subtitle = "binning by ntile function in ggplot2 package",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)theme_ilo函数参考 ggplot2使用技巧。xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(),
model = eda_data_ipt_PQC2_xgb_mod) %>%
xgb.plot.importance()相关口径解释见 xgboost的理解。
Your submission scored 0.14340, which is not an improvement of your best score. Keep trying!
结果没有提升,说明这种PoolQC缺失值填补方案同样没有带来改善。
train_xgb_more_regular <- get_watchlist(train)train_xgb_more_regular_mod <- xgb.train(
data = train_xgb_more_regular$dtrain,
# 1
eta = 0.002,
nround = 20000,
# 2
max_depth = 4,
min_child_weight = 1,
gamma = 0.5,
# 3
subsample = 0.5,
colsample_bytree = 0.5,
# 评价标准
# eval.metric = "error",
eval.metric = "rmse",
# eval.metric = ks_value,
# eval.metric = "auc",
# eval.metric = "logloss",
# objective
objective = "reg:gamma", # 这是一个回归问题
# 其他
seed = 596,
watchlist = train_xgb_more_regular$watchlist,
# 300万数据一起用!
nfold = 2,
early.stop = 50,
nthread = 8
)xgb.save(train_xgb_more_regular_mod, file.path("required_data","train_xgb_more_regular_mod.model"))train_xgb_more_regular_mod <- xgb.load(file.path("required_data","train_xgb_more_regular_mod.model"))dtest <- xgb.DMatrix(data = data.matrix(test %>% select(-Id)))sm_ljx_180529 <-
tibble(
Id = test$Id,
SalePrice = predict(train_xgb_more_regular_mod, dtest)
) %>%
write_csv(get_path("sm_ljx_180529.csv")) %>%
select(everything())bind_rows(
tibble(
y = train_xgb_more_regular$train$SalePrice,
yhat = predict(train_xgb_more_regular_mod, train_xgb_more_regular$dtrain)
) %>%
mutate(tag = 'train'),
tibble(
y = train_xgb_more_regular$test$SalePrice,
yhat = predict(train_xgb_more_regular_mod, train_xgb_more_regular$dtest)
) %>%
mutate(tag = 'test')
) %>%
mutate(yhat_bin = ntile(yhat,20)) %>%
group_by(tag,yhat_bin) %>%
summarise(y = mean(y), yhat = mean(yhat)) %>%
gather(key,value,y:yhat) %>%
ggplot(aes(x = yhat_bin,y = value, col = key)) +
geom_line() +
facet_wrap(~ tag) +
theme_ilo() +
labs(
x = "predicted binned value",
y = "actual value",
title = "lift curve on train and validation set",
subtitle = "binning by ntile function in ggplot2 package",
caption = "Jiaxiang Li - jiaxiangli.netlify.com"
)theme_ilo函数参考 ggplot2使用技巧。xgb.importance(feature_names = train %>% select(-Id,-SalePrice) %>% names(),
model = train_xgb_more_regular_mod) %>%
xgb.plot.importance()相关口径解释见 xgboost的理解。
best_iteration: 11555
best_ntreelimit: 11555
best_score: 30228.03
Your submission scored 0.13871, which is not an improvement of your best score. Keep trying!
reg:linear这些都不是问题,关键是Python前面的特征工程,好好研究。
reg:gamma in Python思路参考CSDN博客。 DMLC (2016) 提供了模型保存和录入等函数。 这次建模特点是
You advanced 851 places on the leaderboard! Your submission scored 0.12124, which is an improvement of your previous score of 0.13513. Great job!
reg:gammaYou advanced 9 places on the leaderboard! Your submission scored 0.12112, which is an improvement of your previous score of 0.12124. Great job!
reg:gammaYour submission scored 0.12178, which is not an improvement of your best score. Keep trying!
1h 29m 56s使用的reg:gamma
[99999] train-error:0.020631 test-error:0.020631
看来增加迭代次数的效果已经体现不出来了。 分析的原因是最后的round不够,学习率比较慢,这次扩大到,200000次,保持\(\eta = 0.0004\)。
Your submission scored 0.12247, which is not an improvement of your best score. Keep trying!
5h 2m 15s使用的reg:gamma
[199999] train-error:0.004858 test-error:0.004858
增加迭代次数的红利消失了。
# Median-ensemble of all past submission files: read every CSV in
# required_data except train/test, keep only the small submission-shaped
# files, and take the per-Id median SalePrice as a new submission.
# NOTE(review): str_subset(".csv") uses an unescaped dot — matches any
# character before "csv"; "\\.csv$" would be safer.
# NOTE(review): list.files(full.names = T) — prefer TRUE over T.
file.path(getwd(),"required_data") %>%
list.files(full.names = T) %>%
str_subset(".csv") %>%
str_subset("^(?!.*train.csv|.*test.csv)") %>%
tibble(path = .) %>%
# NOTE(review): hard-coded absolute path only works on the author's
# machine — basename(path) would be portable. Confirm before reuse.
mutate(shortpath =
str_remove(path,
"/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>%
mutate(data = map(.x = path, .f = read_csv)) %>%
mutate(size = map_dbl(.x = data, .f = object_size)) %>%
select(-path) %>%
# Magic number: 19832 bytes empirically separates submission files
# (Id + SalePrice only) from larger intermediate outputs.
filter(size <= 19832) %>%
select(-size) %>%
unnest() %>%
group_by(Id) %>%
summarise(SalePrice = median(unique(SalePrice))) %>%
write_csv(get_path("sm_ljx_180530.csv")) %>%
select(everything())## # A tibble: 1,459 x 2
## Id SalePrice
## <int> <dbl>
## 1 1461 126166.
## 2 1462 155094.
## 3 1463 177444.
## 4 1464 183861.
## 5 1465 187714.
## 6 1466 175037.
## 7 1467 170139.
## 8 1468 166203.
## 9 1469 175074.
## 10 1470 120218.
## # ... with 1,449 more rows
Your submission scored 0.12545, which is not an improvement of your best score. Keep trying!
ensemble_data_train_nest <-
file.path(getwd(),"required_data") %>%
list.files(full.names = T) %>%
str_subset("train_pred.csv") %>%
tibble(path = .) %>%
mutate(shortpath =
str_remove(path,
"/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>%
mutate(data = map(.x = path, .f = read_csv)) %>%
mutate(size = map_dbl(.x = data, .f = object_size)) %>%
select(-path) %>%
select(-size) expand.grid(
a = ensemble_data_train_nest$shortpath,
b = ensemble_data_train_nest$shortpath
) %>%
left_join(
ensemble_data_train_nest, by = c("a" = "shortpath")
) %>%
rename(data_a = data) %>%
left_join(
ensemble_data_train_nest, by = c("b" = "shortpath")
) %>%
rename(data_b = data) %>%
mutate(equal = map2(.x = data_a, .y = data_b, .f = setequal)) %>%
select(-data_a,-data_b) %>%
.$equalensemble_data_train <-
ensemble_data_train_nest %>%
unnest() %>%
spread(shortpath,SalePrice) %>%
add_column(SalePrice = train$SalePrice) %>%
select(-Id) %>%
select(SalePrice, everything())log_gamma_glm <- glm(SalePrice ~ ., data = ensemble_data_train, family=Gamma(link="log"))
summary(log_gamma_glm)##
## Call:
## glm(formula = SalePrice ~ ., family = Gamma(link = "log"), data = ensemble_data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.07134 -0.02873 0.02377 0.06494 0.20621
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 3.014e+00 1.222e+01 0.247
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -1.056e-05 5.503e-06 -1.919
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA NA NA
## python_xgb_mod_100k_round_train_pred.csv -6.682e-06 1.872e-05 -0.357
## python_xgb_mod_200k_round_train_pred.csv 1.166e-05 1.252e-06 9.318
## python_xgb_mod_40k_round_train_pred.csv -1.014e-05 1.616e-05 -0.627
## python_xgb_mod_train_pred.csv 1.002e-05 1.323e-05 0.757
## train_xgb_mod_train_pred.csv NA NA NA
## train_xgb_more_regular_mod_train_pred.csv -2.837e-05 5.197e-06 -5.458
## train_xgb_trans_mod_train_pred.csv 8.825e-01 1.125e+00 0.784
## Pr(>|t|)
## (Intercept) 0.8052
## eda_data_ipt_PQC_xgb_mod_train_pred.csv 0.0552 .
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv 0.7212
## python_xgb_mod_200k_round_train_pred.csv < 2e-16 ***
## python_xgb_mod_40k_round_train_pred.csv 0.5306
## python_xgb_mod_train_pred.csv 0.4492
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv 5.66e-08 ***
## train_xgb_trans_mod_train_pred.csv 0.4330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.01044164)
##
## Null deviance: 238.757 on 1459 degrees of freedom
## Residual deviance: 19.334 on 1452 degrees of freedom
## AIC: 32961
##
## Number of Fisher Scoring iterations: 6
\(\Box\)gamma 预测有问题。
# Stacking via OLS on the per-model train predictions.
# NOTE(review): naming the result `lm` shadows stats::lm() — prefer a name
# like `lm_fit`; left unchanged because later chunks reference `lm`.
lm <- lm(SalePrice ~ ., data = ensemble_data_train)
summary(lm)##
## Call:
## lm(formula = SalePrice ~ ., data = ensemble_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2337.59 -163.56 7.08 166.17 2807.83
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -4.705e+04 3.754e+04 -1.253
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -1.536e-02 1.691e-02 -0.909
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA NA NA
## python_xgb_mod_100k_round_train_pred.csv -1.839e-01 5.752e-02 -3.197
## python_xgb_mod_200k_round_train_pred.csv 1.271e+00 3.846e-03 330.379
## python_xgb_mod_40k_round_train_pred.csv -7.001e-02 4.966e-02 -1.410
## python_xgb_mod_train_pred.csv -1.691e-02 4.066e-02 -0.416
## train_xgb_mod_train_pred.csv NA NA NA
## train_xgb_more_regular_mod_train_pred.csv 8.518e-03 1.597e-02 0.533
## train_xgb_trans_mod_train_pred.csv 4.298e+03 3.457e+03 1.243
## Pr(>|t|)
## (Intercept) 0.21023
## eda_data_ipt_PQC_xgb_mod_train_pred.csv 0.36374
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv 0.00142 **
## python_xgb_mod_200k_round_train_pred.csv < 2e-16 ***
## python_xgb_mod_40k_round_train_pred.csv 0.15881
## python_xgb_mod_train_pred.csv 0.67753
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv 0.59379
## train_xgb_trans_mod_train_pred.csv 0.21392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 313.9 on 1452 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.335e+07 on 7 and 1452 DF, p-value: < 2.2e-16
ensemble_data_train %>%
mutate(pred = predict(lm)) %>%
select(SalePrice,pred)## # A tibble: 1,460 x 2
## SalePrice pred
## <int> <dbl>
## 1 208500 208895.
## 2 181500 182218.
## 3 223500 223847.
## 4 140000 139579.
## 5 250000 249847.
## 6 143000 143019.
## 7 307000 306872.
## 8 200000 199783.
## 9 129900 129936.
## 10 118000 118085.
## # ... with 1,450 more rows
ensemble_data_test_nest <-
file.path(getwd(),"required_data") %>%
list.files(full.names = T) %>%
str_subset("test_pred.csv") %>%
tibble(path = .) %>%
mutate(shortpath =
str_remove(path,
"/Users/JiaxiangLi/Downloads/me/trans/housingPrices/required_data/")) %>%
mutate(data = map(.x = path, .f = read_csv)) %>%
mutate(size = map_dbl(.x = data, .f = object_size)) %>%
select(-path) %>%
select(-size) ensemble_data_test <-
ensemble_data_test_nest %>%
unnest() %>%
mutate(shortpath = str_replace_all(shortpath, "test","train")) %>%
spread(shortpath,SalePrice)
ensemble_data_test %>% cor()## Id
## Id 1.00000000
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -0.01591978
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv -0.01591978
## python_xgb_mod_100k_round_train_pred.csv -0.04588706
## python_xgb_mod_200k_round_train_pred.csv -0.04653884
## python_xgb_mod_40k_round_train_pred.csv -0.04631311
## python_xgb_mod_train_pred.csv -0.04591920
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## eda_data_ipt_PQC_xgb_mod_train_pred.csv
## Id -0.01591978
## eda_data_ipt_PQC_xgb_mod_train_pred.csv 1.00000000
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv 1.00000000
## python_xgb_mod_100k_round_train_pred.csv -0.04787718
## python_xgb_mod_200k_round_train_pred.csv -0.04823463
## python_xgb_mod_40k_round_train_pred.csv -0.04758602
## python_xgb_mod_train_pred.csv -0.04830702
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv
## Id -0.01591978
## eda_data_ipt_PQC_xgb_mod_train_pred.csv 1.00000000
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv 1.00000000
## python_xgb_mod_100k_round_train_pred.csv -0.04787718
## python_xgb_mod_200k_round_train_pred.csv -0.04823463
## python_xgb_mod_40k_round_train_pred.csv -0.04758602
## python_xgb_mod_train_pred.csv -0.04830702
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv
## Id -0.04588706
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -0.04787718
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv -0.04787718
## python_xgb_mod_100k_round_train_pred.csv 1.00000000
## python_xgb_mod_200k_round_train_pred.csv 0.99990807
## python_xgb_mod_40k_round_train_pred.csv 0.99993137
## python_xgb_mod_train_pred.csv 0.99987879
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## python_xgb_mod_200k_round_train_pred.csv
## Id -0.04653884
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -0.04823463
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv -0.04823463
## python_xgb_mod_100k_round_train_pred.csv 0.99990807
## python_xgb_mod_200k_round_train_pred.csv 1.00000000
## python_xgb_mod_40k_round_train_pred.csv 0.99987920
## python_xgb_mod_train_pred.csv 0.99983766
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## python_xgb_mod_40k_round_train_pred.csv
## Id -0.04631311
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -0.04758602
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv -0.04758602
## python_xgb_mod_100k_round_train_pred.csv 0.99993137
## python_xgb_mod_200k_round_train_pred.csv 0.99987920
## python_xgb_mod_40k_round_train_pred.csv 1.00000000
## python_xgb_mod_train_pred.csv 0.99984427
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## python_xgb_mod_train_pred.csv
## Id -0.04591920
## eda_data_ipt_PQC_xgb_mod_train_pred.csv -0.04830702
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv -0.04830702
## python_xgb_mod_100k_round_train_pred.csv 0.99987879
## python_xgb_mod_200k_round_train_pred.csv 0.99983766
## python_xgb_mod_40k_round_train_pred.csv 0.99984427
## python_xgb_mod_train_pred.csv 1.00000000
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## train_xgb_mod_train_pred.csv
## Id NA
## eda_data_ipt_PQC_xgb_mod_train_pred.csv NA
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv NA
## python_xgb_mod_200k_round_train_pred.csv NA
## python_xgb_mod_40k_round_train_pred.csv NA
## python_xgb_mod_train_pred.csv NA
## train_xgb_mod_train_pred.csv 1
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv
## Id NA
## eda_data_ipt_PQC_xgb_mod_train_pred.csv NA
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv NA
## python_xgb_mod_200k_round_train_pred.csv NA
## python_xgb_mod_40k_round_train_pred.csv NA
## python_xgb_mod_train_pred.csv NA
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv 1
## train_xgb_trans_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv
## Id NA
## eda_data_ipt_PQC_xgb_mod_train_pred.csv NA
## eda_data_ipt_PQC2_xgb_mod_train_pred.csv NA
## python_xgb_mod_100k_round_train_pred.csv NA
## python_xgb_mod_200k_round_train_pred.csv NA
## python_xgb_mod_40k_round_train_pred.csv NA
## python_xgb_mod_train_pred.csv NA
## train_xgb_mod_train_pred.csv NA
## train_xgb_more_regular_mod_train_pred.csv NA
## train_xgb_trans_mod_train_pred.csv 1
# Score the test set with the stacked linear model and keep only the
# submission columns (Id, SalePrice).
# FIX(review): the original piped through select(Id, SalePrice,
# everything()) immediately before select(Id, SalePrice) — the first
# select was redundant and has been removed; the result is identical.
ensemble_data_output <-
ensemble_data_test %>%
mutate(
SalePrice = predict(lm, newdata = ensemble_data_test)
) %>%
select(Id,SalePrice) %>%
write_csv(get_path("sm_ljx_180531.csv"))mutate(shortpath = str_replace(shortpath, "test","train")) %>%
统一训练集和测试集x变量的名称。
Your submission scored 0.75198, which is not an improvement of your best score. Keep trying!
效果不好。 主要是线性结构需要变量之间的相关性比较低,但是这里的相关性都比较高。
模型选取、特征工程参考 Roberts (2018) 和 Serigne (2017)。 相关尝试比较多,这里就不展开了,下面每篇推荐了三个点可以进行尝试,相关代码,直接跳转到参考文献,即可查询。 测试前建议安装好Python环境、Xgboost和Lightgbm。
Roberts (2018) 给出了三点新颖的思路:
还有其他一些改进思路,未进行验证:
Your submission scored 0.11556, which is not an improvement of your best score. Keep trying!
参考 Roberts (2018) 的思路,成绩有所提升。
Serigne (2017) 给出了三点新颖的思路:
You advanced 684 places on the leaderboard! Your submission scored 0.11549, which is an improvement of your previous score of 0.12124. Great job!
参考 Serigne (2017) 的思路,成绩有所提升。
setequal(sm_ljx_180525,sm_ljx_180527)
setequal(sm_ljx_180525,sm_ljx_180528)
setequal(sm_ljx_180525,sm_ljx_180528_02)@ref(PQC)没有改变预测结果。
xgboost过拟合是一个问题。 除了模型 @(model-python)没有过拟合外,其他的结果都是过拟合的。
reg:gamma过拟合下降,train和test差异不大。DMLC. 2016. “Python Package Introduction.” http://xgboost.readthedocs.io/en/latest/python/python_intro.html.
Owen, M. Aaron. 2017. “Kaggle’s Advanced Regression Competition: Predicting Housing Prices in Ames, Iowa.” https://www.r-bloggers.com/kaggles-advanced-regression-competition-predicting-housing-prices-in-ames-iowa/.
Roberts, Jack. 2018. “Top 7.” https://www.kaggle.com/jack89roberts/top-7-using-elasticnet-with-interactions.
Serigne. 2017. “Stacked Regressions : Top 4.” https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard.
输入code
names(train) %>% str_subset("^[:digit:]")
发现变量
[1] "1stFlrSF" "2ndFlrSF" "3SsnPorch"
是数字开头的不符合R命名规则,统一更改。↩
实际上是RMSLE,因为@Roberts2018 对y进行了log处理。↩